library(tidyverse)
library(glue)
library(janitor)
library(caret)
library(lattice)
library(skimr)
library(latticeExtra)
library(lattice)
library(plotly)
library(ggthemr)
library(GGally)
# Switch the ggplot2 look to the ggthemr "light" theme for the plots below.
ggthemr("light")
# Restore the cached workspace objects; the code below reads `raw_train`,
# which is presumably provided by this file (verify against the cache step).
load(file = "../cache/raw_train.rdata")
Hypotheses
H1 - Some cover types have large h distances
- Some cover types have very large absolute distance values, whereas cottonwood stays below 550.
- This could be an additional indicator variable. “h_dist_gt_500”
- Some binning and indicator variables might show promise
- There are also some linear structures in the data that perhaps can be exploited
# Scatter of horizontal vs. vertical distance to hydrology, one facet per
# cover type, with a smoother overlaid on each facet. Points are coloured by
# cover type, so the (redundant) legend is hidden.
raw_train %>%
  dplyr::select(cover_type, contains("hydrology")) %>%
  ggplot(
    mapping = aes(
      x = horizontal_distance_to_hydrology,
      y = vertical_distance_to_hydrology
    )
  ) +
  geom_point(mapping = aes(color = cover_type), alpha = 0.1, stroke = 0) +
  geom_smooth() +
  facet_wrap(~cover_type) +
  theme(legend.position = "none")

# Horizontal distance to hydrology per cover type: jittered raw points with an
# unfilled box plot on top and a gray reference line at zero.
ggplot(
  data = raw_train,
  mapping = aes(x = cover_type, y = horizontal_distance_to_hydrology)
) +
  geom_jitter(alpha = 0.05) +
  geom_boxplot(fill = NA) +
  geom_hline(yintercept = 0, color = "gray")

These plots show that h_dist is either exactly 0 or a positive number.
# Kernel density of horizontal distance to hydrology, one lattice panel per
# cover type, with the kernel bandwidth fixed at 10.
# `plot.points` is given as FALSE rather than `F`: `F` is an ordinary variable
# that can be reassigned, FALSE is a reserved constant.
densityplot(
  ~ horizontal_distance_to_hydrology | cover_type,
  data = raw_train,
  plot.points = FALSE,
  bw = 10
)

# Largest and smallest horizontal distance to hydrology for each cover type.
raw_train %>%
  group_by(cover_type) %>%
  summarise(
    max = max(horizontal_distance_to_hydrology),
    min = min(horizontal_distance_to_hydrology)
  )
H2 - Some cover types have only positive v distances
- There seems to be some truth in this.
- cottonwood is almost entirely concentrated around 0
- spruce and lodgepole have strong -ve components but not so much the other trees.
- perhaps an indicator variable would improve the model “v_dist_lt_0”
- Also, some v_dists are capped at ~270 (like ponderosa, aspen, or douglasfir). Only lodgepole goes > 450. This is a definite predictor; we should have an indicator variable like “v_dist_gt_450”.
- come to think of it… perhaps a binning strategy would benefit here, driving better predictive performance than nominal variable
# Vertical distance to hydrology per cover type as jittered points, with a
# gray reference line at zero to separate negative from positive values.
raw_train %>%
  dplyr::select(cover_type, contains("hydrology")) %>%
  ggplot(mapping = aes(x = cover_type, y = vertical_distance_to_hydrology)) +
  geom_jitter(alpha = 0.05) +
  geom_hline(yintercept = 0, color = "gray")

# Largest and smallest vertical distance to hydrology for each cover type.
raw_train %>%
  group_by(cover_type) %>%
  summarise(
    max = max(vertical_distance_to_hydrology),
    min = min(vertical_distance_to_hydrology)
  )
H3 - Direct distances add value
- Not sure that it does… although it might clean up the distributions a bit. we’ll have to experiment
# Add `dist`: the straight-line (Euclidean) distance to hydrology, combining
# the horizontal and vertical components.
raw_train <- mutate(
  raw_train,
  dist = sqrt(
    horizontal_distance_to_hydrology^2 + vertical_distance_to_hydrology^2
  )
)
package ‘bindrcpp’ was built under R version 3.4.4
# Distribution of the `dist` column per cover type: jittered points with an
# unfilled box plot drawn over them. Requires `dist` to exist in `raw_train`.
ggplot(data = raw_train, mapping = aes(x = cover_type, y = dist)) +
  geom_jitter(alpha = 0.1, stroke = 0) +
  geom_boxplot(fill = NA)

# Pairs plot of the columns whose names contain "hill", coloured by cover type.
# Extra arguments passed through ggpairs(...) are ignored (GGally only warns
# about them), so the previous `alpha = 0.2` never took effect. The
# transparency is now supplied via wrap() to the panels that draw the
# continuous data: scatter points (lower) and densities (diagonal).
raw_train %>%
  dplyr::select(cover_type, contains("hill")) %>%
  ggpairs(
    mapping = aes(color = cover_type),
    lower = list(continuous = wrap("points", alpha = 0.2)),
    diag = list(continuous = wrap("densityDiag", alpha = 0.2))
  )

EDA Takeaways & Ideas
Observations
- {lodgepole:aspen}, {ponderosa:cottonwood} have similar distributions across soils
- no strong correlations among the soils
- clear distinctions among wilderness - {{spruce:lodgepole}:{aspen:krummholz}}, {ponderosa:douglasfir}, {cottonwood}
- Some strong interactions between wilderness & soil type and cover_type
- krummholz is strongly soils > 35
- Shade measures (esp. 9am) seem quite predictive
Ideas
- Interactions can be modeled explicitly for linear models or to boost accuracy of the heuristic models, especially some wilderness*soiltypes
- Change horizontal and vertical distance to hydrology into a single straight-line distance
- lots of opportunities for indicator variables after binning continuous variables
- PCA to dimension reduce soil / hillside vars?
LS0tCnRpdGxlOiAiRURBIC0gUiBTYW5nb2xlIgpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazoKICAgIGhpZ2hsaWdodDogemVuYnVybgogICAgdGhlbWU6IGZsYXRseQogICAgdG9jOiB5ZXMKICBwZGZfZG9jdW1lbnQ6CiAgICBmaWdfd2lkdGg6IDkKICAgIGhpZ2hsaWdodDogemVuYnVybgogICAgdG9jOiB5ZXMKLS0tCmBgYHtyIGxpYnJhcmllcywgbWVzc2FnZT1GQUxTRSwgd2FybmluZz1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoZ2x1ZSkKbGlicmFyeShqYW5pdG9yKQpsaWJyYXJ5KGNhcmV0KQpsaWJyYXJ5KGxhdHRpY2UpCmxpYnJhcnkoc2tpbXIpCmxpYnJhcnkobGF0dGljZUV4dHJhKQpsaWJyYXJ5KGxhdHRpY2UpCmxpYnJhcnkocGxvdGx5KQpsaWJyYXJ5KGdndGhlbXIpCmxpYnJhcnkoR0dhbGx5KQpnZ3RoZW1yKCdsaWdodCcpCmBgYApgYGB7ciBsb2FkX2RhdGF9CmxvYWQoJy4uL2NhY2hlL3Jhd190cmFpbi5yZGF0YScpCmBgYAoKIyBIeXBvdGhlc2VzCgojIyBIMSAtIFNvbWUgY292ZXIgdHlwZXMgaGF2ZSBsYXJnZSBoIGRpc3RhbmNlcwoKLSBTb21lIGNvdmVydHlwZXMgaGF2ZSB2ZXJ5IGxhcmdlIGFic29sdXRlIHZhbHVlcyBvZiBkaXN0YW5jZXMuIGNvdHRvbndvb2QgaGFzIDwgNTUwLiAKLSBUaGlzIGNvdWxkIGJlIGFuIGFkZGl0aW9uYWwgaW5kaWNhdG9yIHZhcmlhYmxlLiAiaF9kaXN0X2d0XzUwMCIKLSBTb21lIGJpbm5pbmcgYW5kIGluZGljYXRvciB2YXJpYWJsZXMgbWlnaHQgc2hvdyBwcm9taXNlCi0gVGhlcmVhIGFyZSBhbHNvIHNvbWUgbGluZWFyIHN0cnVjdHVyZXMgaW4gdGhlIGRhdGEgdGhhdCBwZXJoYXBzIGNhbiBiZSBleHBsb2l0ZWQKCmBgYHtyfQpyYXdfdHJhaW4gJT4lICAgIAogICAgZHBseXI6OnNlbGVjdChjb3Zlcl90eXBlLCBjb250YWlucygiaHlkcm9sb2d5IikpICU+JQogICAgZ2dwbG90KGFlcyhob3Jpem9udGFsX2Rpc3RhbmNlX3RvX2h5ZHJvbG9neSwgdmVydGljYWxfZGlzdGFuY2VfdG9faHlkcm9sb2d5KSkgKwogICAgZ2VvbV9wb2ludChhZXMoY29sb3IgPSBjb3Zlcl90eXBlKSwgYWxwaGEgPSAwLjEsIHN0cm9rZSA9IDApICsKICAgIGdlb21fc21vb3RoKCkgKwogICAgZmFjZXRfd3JhcCh+Y292ZXJfdHlwZSkgKwogICAgdGhlbWUobGVnZW5kLnBvc2l0aW9uPSJub25lIikKYGBgCgpgYGB7cn0KcmF3X3RyYWluICU+JSAgICAKICAgIGdncGxvdChhZXMoY292ZXJfdHlwZSwgaG9yaXpvbnRhbF9kaXN0YW5jZV90b19oeWRyb2xvZ3kpKSsKICAgIGdlb21faml0dGVyKGFscGhhPTAuMDUpKwogICAgZ2VvbV9ib3hwbG90KGZpbGw9TkEpKwogICAgZ2VvbV9obGluZSh5aW50ZXJjZXB0ID0gMCwgY29sb3I9J2dyYXknKQpgYGAKCi4gVGhlc2UgcGxvdHMgc2hvdyBoX2Rpc3QgaXMgZWl0aGVyIGV4YWN0bHkgMCBvciB0aGVyZSdzIGEgcG9zaXRpdmUgbnVtYmVyCgpgYGB7cn0KZGVuc2l0eXBsb3Qofmhvcml6b250YWxfZGlzdGFuY2VfdG9faHlkcm9sb2d5fGNvdmVyX3R5
cGUscmF3X3RyYWluLCBwbG90LnBvaW50cz1GLGJ3ID0gMTApCmBgYAoKYGBge3J9CnJhd190cmFpbiAlPiUKICAgIGdyb3VwX2J5KGNvdmVyX3R5cGUpICU+JSAKICAgIHN1bW1hcmlzZShtYXggPSBtYXgoaG9yaXpvbnRhbF9kaXN0YW5jZV90b19oeWRyb2xvZ3kpLCBtaW4gPSBtaW4oaG9yaXpvbnRhbF9kaXN0YW5jZV90b19oeWRyb2xvZ3kpKQpgYGAKCgojIyBIMiAtIFNvbWUgY292ZXIgdHlwZXMgaGF2ZSBvbmx5IHBvc2l0aXZlIHYgZGlzdGFuY2VzCgotIFRoZXJlIHNlZW1zIHRvIGJlIHNvbWUgdHJ1dGggaW4gdGhpcy4KLSBjb3R0b253b29kIGlzIGFic29sdXRlbHkgYWxtb3N0IGFyb3VuZCAwCi0gc3BydWNlIGFuZCBsb2RnZXBvbGUgaGF2ZSBzdHJvbmcgLXZlIGNvbXBvbmVudHMgYnV0IG5vdCBzbyBtdWNoIHRoZSBvdGhlciB0cmVlcy4gCi0gcGVyaGFwcyBhbiBpbmRpY2F0b3IgdmFyaWFibGUgd291bGQgaW1wcm92ZSB0aGUgbW9kZWwgInZfZGlzdF9sdF8wIgotIGFsc28sIHNvbWUgdl9kc3RzIGFyZSBjYXBwZWQgYXQgfjI3MCAobGlrZSBwb25kZXJvc2Egb3IgYXNwZW4gb3IgZG91Z2xhc2ZpcikuIE9ubHkgbG9kZ2Vwb2xlIGdvZXMgPiA0NTAuIHRoaXMgaXMgYSBkZWZpbml0ZSBwcmVkaWN0b3IuIHdlIHNob3VsZCBoYXZlIGFuIGluZGljYXRvciB2YXIgbGlrZSAidl9kaXN0X2d0XzQ1MCIuIAotIGNvbWUgdG8gdGhpbmsgb2YgaXQuLi4gcGVyaGFwcyBhIGJpbm5pbmcgc3RyYXRlZ3kgd291bGQgYmVuZWZpdCBoZXJlLCBkcml2aW5nIGJldHRlciBwcmVkaWN0aXZlIHBlcmZvcm1hbmNlIHRoYW4gbm9taW5hbCB2YXJpYWJsZQoKYGBge3J9CnJhd190cmFpbiAlPiUgICAgCiAgICBkcGx5cjo6c2VsZWN0KGNvdmVyX3R5cGUsIGNvbnRhaW5zKCJoeWRyb2xvZ3kiKSkgJT4lCiAgICBnZ3Bsb3QoYWVzKGNvdmVyX3R5cGUsIHZlcnRpY2FsX2Rpc3RhbmNlX3RvX2h5ZHJvbG9neSkpKwogICAgZ2VvbV9qaXR0ZXIoYWxwaGE9MC4wNSkrCiAgICBnZW9tX2hsaW5lKHlpbnRlcmNlcHQgPSAwLCBjb2xvcj0nZ3JheScpCmBgYApgYGB7cn0KcmF3X3RyYWluICU+JQogICAgZ3JvdXBfYnkoY292ZXJfdHlwZSkgJT4lIAogICAgc3VtbWFyaXNlKG1heCA9IG1heCh2ZXJ0aWNhbF9kaXN0YW5jZV90b19oeWRyb2xvZ3kpLCBtaW4gPSBtaW4odmVydGljYWxfZGlzdGFuY2VfdG9faHlkcm9sb2d5KSkKYGBgCgojIyBIMyAtIERpcmVjdCBkaXN0YW5jZXMgYWRkIHZhbHVlCgotIE5vdCBzdXJlIHRoYXQgaXQgZG9lcy4uLiBhbHRob3VnaCBpdCBtaWdodCBjbGVhbiB1cCB0aGUgZGlzdHJpYnV0aW9ucyBhIGJpdC4gd2UnbGwgaGF2ZSB0byBleHBlcmltZW50CgpgYGB7cn0KcmF3X3RyYWluIDwtIHJhd190cmFpbiAlPiUgCiAgICBtdXRhdGUoZGlzdCA9IHNxcnQoaG9yaXpvbnRhbF9kaXN0YW5jZV90b19oeWRyb2xvZ3leMit2ZXJ0aWNhbF9kaXN0YW5jZV90b19oeWRyb2xvZ3leMikpCnJhd190cmFpbiAlPiUgCiAgICBnZ3Bsb3QoYWVzKGNv
dmVyX3R5cGUsIGRpc3QpKSArCiAgICBnZW9tX2ppdHRlcihhbHBoYSA9IDAuMSwgc3Ryb2tlID0gMCkgKwogICAgZ2VvbV9ib3hwbG90KGZpbGw9TkEpCmBgYAoKCmBgYHtyIG1lc3NhZ2U9RkFMU0UsIHdhcm5pbmc9RkFMU0UsIGZpZy5oZWlnaHQ9NiwgZmlnLndpZHRoPTEwfQpyYXdfdHJhaW4gJT4lICAgIAogICAgZHBseXI6OnNlbGVjdChjb3Zlcl90eXBlLCBjb250YWlucygiaGlsbCIpKSAlPiUKICAgIGdncGFpcnMoYWVzKGNvbG9yPWNvdmVyX3R5cGUpLCBhbHBoYT0wLjIpCmBgYAoKIyBFREEgVGFrZWF3YXlzICYgSWRlYXMKCiMjIE9ic2VydmF0aW9ucwoKMS4ge2xvZGdlcG9sZTphc3Blbn0sIHtwb25kZXJvc2E6Y290dG9ud29vZH0gaGF2ZSBzaW1pbGFyIGRpc3RyaWJ1dGlvbnMgYWNyb3NzIHNvaWxzCjEuIG5vIHN0cm9uZyBjb3JyZWxhdGlvbnMgYW1vbmcgdGhlIHNvaWxzCjEuIGNsZWFyIGRpc3RpbmN0aW9ucyBhbW9uZyB3aWxkZXJuZXNzIC0ge3tzcHJ1Y2U6bG9kZ2Vwb2xlfTp7YXNwZW46a3J1bW1ob2x6fX0sIHtwb25kZXJvc2E6ZG91Z2xhc2Zpcn0sIHtjb3R0b253b29kfQoxLiBTb21lIHN0cm9uZyBpbnRlcmFjdGlvbnMgYmV0d2VlbiB3aWxkZXJuZXNzICYgc29pbCB0eXBlIGFuZCBjb3Zlcl90eXBlCjEuIGtydW1taG9seiBpcyBzdHJvbmdseSBzb2lscyA+IDM1CjEuIFNoYWRlIG1lYXN1cmVzIChlc3A5YW0pIHNlZW0gcXVpdGUgcHJlZGljdGl2ZQoKIyMgSWRlYXMKCjEuIEludGVyYWN0aW9ucyBjYW4gYmUgbW9kZWxlZCBleHBsaWNpdGVseSBmb3IgbGluZWFyIG1vZGVscyBvciB0byBib29zdCBhY2N1cmFjeSBvZiB0aGUgaGV1cmlzdGljIG1vZGVscywgZXNwZWNpYWxseSBzb21lIHdpbGRlcm5lc3Mqc29pbHR5cGVzCjEuIGNoYW5nZSBob3JpIGFuZCB2ZXJ0aWNhbCBkaXN0YW5jZSB0byBoeWRyb2xvZ3kgdG8gYSBzdHJhaWdodCBsaW5lIGRpc3RhbmNlCjEuIGxvdHMgb2Ygb3Bwb3J0dW5pdGllcyBmb3IgaW5kaWNhdG9yIHZhcmlhYmxlcyBhZnRlciBiaW5uaW5nIGNvbnRpbnVvdXMgdmFyaWFibGVzCjEuIFBDQSB0byBkaW1lbnNpb24gcmVkdWNlIHNvaWwgLyBoaWxsc2lkZSB2YXJzPwo=